# encoding: UTF-8
# user: admin at 2018/3/21

from os import path
import jieba
import jieba.analyse

from wordcloud import WordCloud, STOPWORDS

import matplotlib.pyplot as plt

stopwords = {}

def importStopword(filename=''):
    """Load one stopword per line into the global ``stopwords`` dict."""
    global stopwords
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            word = line.rstrip()
            if word:                 # skip blank lines instead of stopping at them
                stopwords[word] = 1
    print(len(stopwords))

def processChinese(text):
    """Segment Chinese text with jieba and drop stopwords and whitespace tokens."""
    seg_generator = jieba.cut(text)
    seg_list = [seg for seg in seg_generator
                if seg not in stopwords and seg not in (' ', '\n')]
    # Join with spaces so the result can be fed straight to WordCloud.generate().
    return ' '.join(seg_list)

# Extract the top-N keywords from the text with jieba's TF-IDF extractor.
def analyseKey(text, N=25, is_idf=False):
    jieba.analyse.set_stop_words('./extra_dict/stopwords.txt')
    if is_idf:
        # Optionally point jieba at a custom IDF corpus for the TF-IDF weights.
        jieba.analyse.set_idf_path('../extra_dict/idf.txt.big')

    tags = jieba.analyse.extract_tags(text, topK=N)
    return ' '.join(tags)
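
# A minimal sketch (not part of the original script): extract_tags also accepts
# withWeight=True, which returns (word, weight) pairs instead of bare words.
# The helper name below is hypothetical.
def analyseKeyWithWeights(text, N=25):
    pairs = jieba.analyse.extract_tags(text, topK=N, withWeight=True)
    # pairs is a list of (keyword, TF-IDF weight) tuples, highest weight first.
    return pairs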

if __name__ == '__main__':
    importStopword(filename='./extra_dict/stopwords.txt')
    d = path.dirname(__file__)
    # Read the full text of the 2018 government work report.
    with open(path.join(d, './words/2018政府工作报告.txt'), encoding='utf-8') as f:
        context = f.read().strip('\n')
    text = processChinese(context)
    print(text)

    tags = analyseKey(context, N=25)
    print('******')
    print('The 25 most frequently used words in the 2018 government work report:')
    print(tags)
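
    # A minimal sketch (not in the original script) that renders the segmented
    # text with the WordCloud/matplotlib imports above. The font path below is
    # an assumption: Chinese text needs a CJK-capable font, otherwise the words
    # show up as empty boxes.
    wc = WordCloud(font_path='simhei.ttf',   # hypothetical font file path
                   background_color='white',
                   max_words=100,
                   stopwords=STOPWORDS       # built-in English stopword set
                   ).generate(text)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()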